Raw data

- Same raw data from my previous project

- Salmon re-mapping with newly built decoys and indexing files

Loading packages

library(data.table)
library(rmarkdown)
library(AnnotationHub)
library(tidyverse)
library(tximport)
library(ggplot2)
library(DESeq2)
library(pheatmap)
library(gridExtra)
library(ggplotify)
library(ggrepel)
library(UpSetR)

Setting AnnotationHub

AnnotationSpecies <- "Homo sapiens"  # Assign your species 
ah <- AnnotationHub(hub=getAnnotationHubOption("URL"))   # Bring annotation DB

Running AnnotationHub

ahQuery <- query(ah, c("OrgDb", AnnotationSpecies))      # Filter annotation of interest
if (length(ahQuery) == 1) {
    DBName <- names(ahQuery)
} else if (length(ahQuery) > 1) {
               DBName <- names(ahQuery)[1]
} else {
    print("You don't have a valid DB")
    rmarkdown::render() 
} 
AnnoDb <- ah[[DBName]] # Store into an OrgDb object  
# Explore your OrgDb object with following accessors:
# columns(AnnpDb)
# keytypes(AnnoDb)
# keys(AnnoDb, keytype=..)
# select(AnnoDb, keys=.., columns=.., keytype=...)
AnnoKey <- keys(AnnoDb, keytype="ENSEMBLTRANS")
# Note: Annotation has to be done with not genome but transcripts 
AnnoDb <- select(AnnoDb, 
                 AnnoKey,
                 keytype="ENSEMBLTRANS",
                 columns="SYMBOL")

Checking out the AnnotationHub output

# Check if your AnnoDb has been extracted and saved correctely
class(AnnoDb)
## [1] "data.frame"
head(AnnoDb)
##      ENSEMBLTRANS SYMBOL
## 1 ENST00000543404  A2MP1
## 2 ENST00000566278  A2MP1
## 3 ENST00000545343  A2MP1
## 4 ENST00000544183  A2MP1
## 5 ENST00000286479   NAT2
## 6 ENST00000520116   NAT2

Defining file name and path for .sf files

.sf files have been created from fastq data by salmon

# This code chunk needs to be written by yourself 
# Define sample names 
SampleNames <-  c("Mock_72hpi_S1",
                 "Mock_72hpi_S2",
                 "Mock_72hpi_S3",
                 "SARS-CoV-2_72hpi_S7",
                 "SARS-CoV-2_72hpi_S8",
                 "SARS-CoV-2_72hpi_S9") 
# Define group level
GroupLevel <- c("Mock", "COVID")
# Define contrast for DE analysis
Contrast <- c("Group", "COVID", "Mock")
# Define a vector for comparing TPM vs Counts effect 
TvC <- c("TPM", "Counts")
# Define .sf file path
sf <- c(paste0(SampleNames,
               ".salmon_quant/quant.sf"))
# Define sample groups
group <- c(rep("Mock", 3), rep("COVID", 3))
# Create metadata
metadata <- data.frame(Sample=factor(SampleNames, levels=SampleNames),
                       Group=factor(group, levels=GroupLevel),
                       Path=sf)
rownames(metadata) <- SampleNames
# Explore the metadata
print(metadata)
##                                  Sample Group
## Mock_72hpi_S1             Mock_72hpi_S1  Mock
## Mock_72hpi_S2             Mock_72hpi_S2  Mock
## Mock_72hpi_S3             Mock_72hpi_S3  Mock
## SARS-CoV-2_72hpi_S7 SARS-CoV-2_72hpi_S7 COVID
## SARS-CoV-2_72hpi_S8 SARS-CoV-2_72hpi_S8 COVID
## SARS-CoV-2_72hpi_S9 SARS-CoV-2_72hpi_S9 COVID
##                                                          Path
## Mock_72hpi_S1             Mock_72hpi_S1.salmon_quant/quant.sf
## Mock_72hpi_S2             Mock_72hpi_S2.salmon_quant/quant.sf
## Mock_72hpi_S3             Mock_72hpi_S3.salmon_quant/quant.sf
## SARS-CoV-2_72hpi_S7 SARS-CoV-2_72hpi_S7.salmon_quant/quant.sf
## SARS-CoV-2_72hpi_S8 SARS-CoV-2_72hpi_S8.salmon_quant/quant.sf
## SARS-CoV-2_72hpi_S9 SARS-CoV-2_72hpi_S9.salmon_quant/quant.sf

Converting .sf files to txi list

- txi_tpm: stores TPM with the argument “countsFromAbundance=”lengthScaledTPM"

- txi_counts: stores original counts

- In this project, two txi objects were created with or without the countsFromAbundance=“lengthScaledTPM” argument and compared in downstream DE analysis.

- If you don’t want gene-level summarization, set txOut=TRUE.

- References: tximport doc, DESeq2 doc “Why unnormalized counts?”, Soneson et al. 2016, Developer Dr. Love’s comment, Harvard Chan Bioinformatics Core workshop

# Assign sample names to the input (.sf) file path
names(sf) <- SampleNames
# Run tximport
# tpm vs original counts
# input sf: a factor of all .sf files' path
txi_tpm <- tximport(sf, 
                    type="salmon",
                    tx2gene=AnnoDb,
                    countsFromAbundance="lengthScaledTPM", # Extracts TPM 
                    ignoreTxVersion=T) 
txi_counts <- tximport(sf, 
                    type="salmon",
                    tx2gene=AnnoDb,
                    ignoreTxVersion=T) 

Exploring the txi outputs

# tpm 
head(txi_tpm$counts)
##         Mock_72hpi_S1 Mock_72hpi_S2 Mock_72hpi_S3 SARS-CoV-2_72hpi_S7
## A2MP1        5.077648      5.609035     0.8096504             0.00000
## A3GALT2      0.000000      0.000000     0.0000000             0.00000
## A4GNT        0.000000      2.974963     1.0038198             0.00000
## AACSP1      14.891510     19.954031     9.2490944            19.26746
## AADACL2      0.000000      0.000000     0.0000000             0.00000
## AADACL4      1.013374      0.000000     2.0033163             0.00000
##         SARS-CoV-2_72hpi_S8 SARS-CoV-2_72hpi_S9
## A2MP1               0.00000           7.1022218
## A3GALT2             0.00000           0.0000000
## A4GNT               0.00000           0.9959273
## AACSP1             17.46992           4.5607796
## AADACL2             0.00000           0.0000000
## AADACL4             1.98163           0.0000000
dim(txi_tpm$counts)
## [1] 12202     6
# counts
head(txi_counts$counts)
##         Mock_72hpi_S1 Mock_72hpi_S2 Mock_72hpi_S3 SARS-CoV-2_72hpi_S7
## A2MP1           6.186             7             1                   0
## A3GALT2         0.000             0             0                   0
## A4GNT           0.000             3             1                   0
## AACSP1         16.000            22            10                  21
## AADACL2         0.000             0             0                   0
## AADACL4         1.000             0             2                   0
##         SARS-CoV-2_72hpi_S8 SARS-CoV-2_72hpi_S9
## A2MP1                     0                   3
## A3GALT2                   0                   0
## A4GNT                     0                   1
## AACSP1                   10                   5
## AADACL2                   0                   0
## AADACL4                   2                   0
dim(txi_counts$counts)
## [1] 12202     6

Creating DESeq objects from txi and VST

- Note: The tximport-to-DESeq2 approach uses estimated gene counts from the transcript abundance quantifiers, but not normalized counts.

- The DESeqDataSetFromTximport() function generated an DESeq object (aka dds) with the txi input.

- vst() was run to perform variance stabilizing transformation instead of rlog() which takes longer time with similar characteristics.

- The vsd object created by vst() is used for not DE analysis but QC.

- References: DESeq2 doc “Transcript abundance files”, DESeq2 doc “Variance stabilizing transformation”

# Set a function creating dds and vsd
dds_vsd_fn <- function(txi) { 
    # Create a DESeq object (so-calledd dds) 
    des <- DESeqDataSetFromTximport(txi, 
                                    colData=metadata,
                                    design=~Group)
    # Create a vsd object (so-called vsd) 
    ves <- vst(des, blind=T)
    # Output them as a list 
    return(list(dds=des, vsd=ves))
}
TPM <- dds_vsd_fn(txi_tpm)
Counts <- dds_vsd_fn(txi_counts)
# Outputs
# dds: TPM/Counts[[1]] or TPM/Counts[['dds']] 
# vsd: TPM/Counts[[2]] or TPM/Counts[['vsd']]

Exploring created dds

# TPM 
TPM[[1]]
## class: DESeqDataSet 
## dim: 12202 6 
## metadata(1): version
## assays(1): counts
## rownames(12202): A2MP1 A3GALT2 ... ZXDB ZYXP1
## rowData names(0):
## colnames(6): Mock_72hpi_S1 Mock_72hpi_S2 ... SARS-CoV-2_72hpi_S8
##   SARS-CoV-2_72hpi_S9
## colData names(3): Sample Group Path
head(counts(TPM[[1]]))
##         Mock_72hpi_S1 Mock_72hpi_S2 Mock_72hpi_S3 SARS-CoV-2_72hpi_S7
## A2MP1               5             6             1                   0
## A3GALT2             0             0             0                   0
## A4GNT               0             3             1                   0
## AACSP1             15            20             9                  19
## AADACL2             0             0             0                   0
## AADACL4             1             0             2                   0
##         SARS-CoV-2_72hpi_S8 SARS-CoV-2_72hpi_S9
## A2MP1                     0                   7
## A3GALT2                   0                   0
## A4GNT                     0                   1
## AACSP1                   17                   5
## AADACL2                   0                   0
## AADACL4                   2                   0
# Counts
Counts[[1]]
## class: DESeqDataSet 
## dim: 12202 6 
## metadata(1): version
## assays(2): counts avgTxLength
## rownames(12202): A2MP1 A3GALT2 ... ZXDB ZYXP1
## rowData names(0):
## colnames(6): Mock_72hpi_S1 Mock_72hpi_S2 ... SARS-CoV-2_72hpi_S8
##   SARS-CoV-2_72hpi_S9
## colData names(3): Sample Group Path
head(counts(Counts[[1]]))
##         Mock_72hpi_S1 Mock_72hpi_S2 Mock_72hpi_S3 SARS-CoV-2_72hpi_S7
## A2MP1               6             7             1                   0
## A3GALT2             0             0             0                   0
## A4GNT               0             3             1                   0
## AACSP1             16            22            10                  21
## AADACL2             0             0             0                   0
## AADACL4             1             0             2                   0
##         SARS-CoV-2_72hpi_S8 SARS-CoV-2_72hpi_S9
## A2MP1                     0                   3
## A3GALT2                   0                   0
## A4GNT                     0                   1
## AACSP1                   10                   5
## AADACL2                   0                   0
## AADACL4                   2                   0

Exploring created vsd

# TPM
TPM[[2]]
## class: DESeqTransform 
## dim: 12202 6 
## metadata(1): version
## assays(1): ''
## rownames(12202): A2MP1 A3GALT2 ... ZXDB ZYXP1
## rowData names(4): baseMean baseVar allZero dispFit
## colnames(6): Mock_72hpi_S1 Mock_72hpi_S2 ... SARS-CoV-2_72hpi_S8
##   SARS-CoV-2_72hpi_S9
## colData names(4): Sample Group Path sizeFactor
# Counts
Counts[[2]]
## class: DESeqTransform 
## dim: 12202 6 
## metadata(1): version
## assays(1): ''
## rownames(12202): A2MP1 A3GALT2 ... ZXDB ZYXP1
## rowData names(4): baseMean baseVar allZero dispFit
## colnames(6): Mock_72hpi_S1 Mock_72hpi_S2 ... SARS-CoV-2_72hpi_S8
##   SARS-CoV-2_72hpi_S9
## colData names(3): Sample Group Path

Estimating size factors, dispersions, and conducting Wald Test

- Dispersion is calculated as a measure of variation instead of variance since variance gets larger when gene expression gets higher.

- Wald test is the default setting of DESeq2 which tests null hypothesis between two groups. You should use Likelihood ratio test (LRT) when comparing more than two groups.

- Messages when “Counts <- DESeqPrep_fn(Counts)” was run:

using ‘avgTxLength’ from assays(dds), correcting for library size gene-wise dispersion estimates mean-dispersion relationship final dispersion estimates

- References: Harvard Chan Bioinformatics Core workshop I, Harvard Chan Bioinformatics Core workshop II, Harvard Chan Bioinformatics Core workshop III, DESeq2 "Wald test indivisual steps, DESeq2 doc “Likelihood ratio test”

# Set a function estimating size factors, dispersions, and perform wald test
DESeqPrep_fn <- function(List) {
    
    List[[1]] <- estimateSizeFactors(List[[1]])
    List[[1]] <- estimateDispersions(List[[1]])
    List[[1]] <- nbinomWaldTest(List[[1]])
   
    return(List)
}
# Update dds with the function
Counts <- DESeqPrep_fn(Counts) 
TPM <- DESeqPrep_fn(TPM)

Exploring size factors

sizeFactors(Counts[[1]])
## NULL
sizeFactors(TPM[[1]])
##       Mock_72hpi_S1       Mock_72hpi_S2       Mock_72hpi_S3 SARS-CoV-2_72hpi_S7 
##           1.1075601           1.0513174           0.8852784           1.3664601 
## SARS-CoV-2_72hpi_S8 SARS-CoV-2_72hpi_S9 
##           0.7661170           1.0051344
# Size factors don't exist in the Counts dds!
# Normalization factors are calculated in the Counts dds instead! 
assays(Counts[[1]])
## List of length 6
## names(6): counts avgTxLength normalizationFactors mu H cooks
assays(TPM[[1]])
## List of length 4
## names(4): counts mu H cooks
colData(Counts[[1]])
## DataFrame with 6 rows and 3 columns
##                                  Sample    Group                   Path
##                                <factor> <factor>            <character>
## Mock_72hpi_S1       Mock_72hpi_S1          Mock  Mock_72hpi_S1.salmon..
## Mock_72hpi_S2       Mock_72hpi_S2          Mock  Mock_72hpi_S2.salmon..
## Mock_72hpi_S3       Mock_72hpi_S3          Mock  Mock_72hpi_S3.salmon..
## SARS-CoV-2_72hpi_S7 SARS-CoV-2_72hpi_S7    COVID SARS-CoV-2_72hpi_S7...
## SARS-CoV-2_72hpi_S8 SARS-CoV-2_72hpi_S8    COVID SARS-CoV-2_72hpi_S8...
## SARS-CoV-2_72hpi_S9 SARS-CoV-2_72hpi_S9    COVID SARS-CoV-2_72hpi_S9...
colData(TPM[[1]])
## DataFrame with 6 rows and 4 columns
##                                  Sample    Group                   Path
##                                <factor> <factor>            <character>
## Mock_72hpi_S1       Mock_72hpi_S1          Mock  Mock_72hpi_S1.salmon..
## Mock_72hpi_S2       Mock_72hpi_S2          Mock  Mock_72hpi_S2.salmon..
## Mock_72hpi_S3       Mock_72hpi_S3          Mock  Mock_72hpi_S3.salmon..
## SARS-CoV-2_72hpi_S7 SARS-CoV-2_72hpi_S7    COVID SARS-CoV-2_72hpi_S7...
## SARS-CoV-2_72hpi_S8 SARS-CoV-2_72hpi_S8    COVID SARS-CoV-2_72hpi_S8...
## SARS-CoV-2_72hpi_S9 SARS-CoV-2_72hpi_S9    COVID SARS-CoV-2_72hpi_S9...
##                     sizeFactor
##                      <numeric>
## Mock_72hpi_S1         1.107560
## Mock_72hpi_S2         1.051317
## Mock_72hpi_S3         0.885278
## SARS-CoV-2_72hpi_S7   1.366460
## SARS-CoV-2_72hpi_S8   0.766117
## SARS-CoV-2_72hpi_S9   1.005134

Plotting the size factors in TPM

- The size factors are only available from TPM dds

- Blue dashed line: normalization factor = 1

# Extract and save the size factors in a data frame
sizeFactor <- as.data.frame(round(sizeFactors(TPM[[1]]), 3))
colnames(sizeFactor) <- 'Size_Factor'
sizeFactor <- sizeFactor %>%
    rownames_to_column(var="Sample") %>%
    inner_join(metadata[, 1:ncol(metadata)-1], by="Sample") 
# Create a plot comparing the size factors by sample
ggplot(sizeFactor, aes(x=Sample, 
                       y=Size_Factor, 
                       fill=Group,
                       label=Size_Factor)) +
    geom_bar(stat="identity", width=0.8) +
    theme_bw() + 
    ggtitle("Size Factors in TPM-DESeq") +
    geom_text(vjust=1.5) +
    theme(axis.text.x=element_text(angle=45, 
                                   vjust=0.5)) + ylab("Size Factor") + geom_hline(yintercept=1, color="blue", linetype="dashed")

Plotting nornalization factors in the Counts

- DESeq2 performs an internal normalization where geometric mean is calculated for each gene across all samples. The counts for a gene in each sample is then divided by this mean. The median of these ratios in a sample is the size factor for that sample.

- Blue dashed line: normalization factor = 1

- Colored dots: normlization factors per gene (y-axis) in each sample

- Box plots: distribution of the normalization facters in each group (see the second plot)

- Reference: DESeq2 doc “Sample-/gene-dependent normalization factors”

# Extract and normalization factors in a data frame
normf <- as.data.frame(normalizationFactors(Counts[[1]])) %>%
    gather(Sample, Normalization_Factor) %>%
    inner_join(metadata[, 1:2], by="Sample") 
normf$Sample <- factor(normf$Sample, levels=SampleNames)
normf$Group <- factor(normf$Group, levels=GroupLevel)
# Create a scatter plot showing distribution of normalization factors 
normFactors_plot <- ggplot(normf, 
       aes(x=Sample, y=Normalization_Factor)) + 
geom_jitter(alpha=0.5, aes(color=Group)) + 
# Add a boxplot to provide statistics in each sample
geom_boxplot(aes(x=Sample, y=Normalization_Factor), 
             outlier.shape=NA, alpha=0.5) + 
theme_bw() +
ggtitle("Normalization Factors in Counts-DESeq") +
theme(axis.text.x=element_text(angle=45, 
                               vjust=0.5)) + 
ylab("Normalization Factor / Gene") +
# Add a dashed horizontal line to indicate where normalization factor=1
geom_hline(yintercept=1, 
           color="blue", 
           linetype="dashed")
# Print the normalization factor scatter plot 
print(normFactors_plot)

# Print the same plot with larger y-magnification in order to observe the box plot 
normFactors_plot + 
    ylim(0.5, 1.5)

Setting functions for QC plots

- Reference: DESeq2 doc “Principal component plot of the samples”, DESeq2 doc “Heatmap of the sample-to-sample distances”

# Assigne what to compare
GroupOfInterest <- Contrast[1] 
# Set a function for a PCA plot
QCPCA_fn <- function(inputList, Title) {
    plotPCA(inputList[[2]],    # takes vsd
            intgroup=GroupOfInterest) + theme_bw() + ggtitle(Title)
}
# Set heatmap annotation 
ColOfInterest <- !colnames(metadata) %in% c("Sample", "Path")
HeatmapAnno <- as.data.frame(metadata[, ColOfInterest])
rownames(HeatmapAnno) <- SampleNames
colnames(HeatmapAnno) <- colnames(metadata)[ColOfInterest]
# Set a function for a correlation heatmap 
QCcorrHeatmap_fn <- function(inputList, Title) {
    # Extract transformed count matrix
    mtx <- assay(inputList[[2]])      # takes vsd
    # Calculate correlation and store in the matrix
    mtx <- cor(mtx)
    
    
    # Create a correlation heatmap
    return(pheatmap(mtx, 
             annotation=HeatmapAnno,
             main=paste("Sample Correlation Heatmap:",
                        Title)))
}

Sample QC: Principal Component Analysis

- Checkpoints in PCA: source of variation, sample outlier

grid.arrange(QCPCA_fn(TPM, "QC PCA: TPM"), 
             QCPCA_fn(Counts, "QC PCA: Counts"), 
             nrow=2)

Sample QC: Sample Correlation Heatmap

- Checkpoints of correlation heatmap: distance between samples, correlation in a group

- Upper: TPM input

- Lower: Counts input

# TPM
QCcorrHeatmap_fn(TPM, "TPM") 

# Counts
QCcorrHeatmap_fn(Counts, "Counts") 

Running DE analysis

# Create a list for TPM and Counts dds 
ddsList <- list(TPM=TPM[[1]], Counts=Counts[[1]]) 
for (x in TvC) {
    # Run DESeq() 
    ddsList[[x]] <- DESeq(ddsList[[x]])
    print(resultsNames(ddsList[[x]]))
}
## [1] "Intercept"           "Group_COVID_vs_Mock"
## [1] "Intercept"           "Group_COVID_vs_Mock"

Creating dispersion plots

- Dispersion is important since estimation by DESeq2 algorithm is based on the assumption that genes with similar expression levels have similar dispersion. If an RNA-seq dataset doesn’t satisfy this assumption, use other DE algorithms than DESeq2.

- References: DESeq2 doc "Dispersion plot and fitting alternatives, Harvard Chan Bioinformatics Core workshop

# Plot dispersion  
for (x in TvC) {
    plotDispEsts(ddsList[[x]], 
                 ylab="Dispersion", 
                 xlab="Mean of Normalized Counts", 
                 main=paste("Dispersion of", x, "Input"))
}

Shrinking effect size

- Shrinkage reduces false positives

- Magnitude of shrinkage is affected by dispersion and sample size

- When the argument type is set to “apeglm”, the coef argument is used instead of constrast. In this dataset, you can set coef=Coef where Coef <- resultsNames(ddsList[[1]]).

- When the type is set to “normal”, the argument contrast is set as shown below.

- References: DESeq2 doc “Alternative shrinkage estimators”, Harvard Chan Bioinformatics Core workshop

# Create an empty list for shrunken dds
shr_ddsList <- list(TPM=c(), Counts=c()) 
for (x in TvC) {
    # shrink
    shr_ddsList[[x]] <- lfcShrink(ddsList[[x]], 
                                  contrast=Contrast, # contrast  
                                  type="normal")     # is paired with "normal" type
}

Extracting log2FoldChange and p-values with or without shrinkage

- The alpha denotes threshold of false discovery rate (FDR) assigned by users.

- In this analysis, the alpha is set to 0.1

# Set FDR threshold 
alpha=0.1 
# FDR threshold vector
FDRv=c("< 0.1", "> 0.1") 
# Set a function cleaning table
Sig_fn <- function(df, Input) {
    df <- df %>% 
        rownames_to_column(var="Gene") %>%
        mutate(FDR=ifelse(padj < 0.1 & !is.na(padj), 
                                   FDRv[1], 
                                   FDRv[2]), 
               Input=Input) 
    return(df)
}
# Initialize lists of result tables with (resList) or without (shr_resList) shrinkage
resList <- ddsList 
shr_resList <- ddsList  
for (x in TvC) {
    # Extract results
    resList[[x]] <- as.data.frame(results(ddsList[[x]], 
                                          contrast=Contrast, 
                                          alpha=alpha))
    shr_resList[[x]] <- as.data.frame(shr_ddsList[[x]])
    # clean the data frame
    resList[[x]] <- Sig_fn(resList[[x]], x)
    shr_resList[[x]] <- Sig_fn(shr_resList[[x]], x)
    
}

Exploratory data analysis of the extracted log2FoldChange tables

# No shrinkage summary
summary(resList)
##        Length Class      Mode
## TPM    9      data.frame list
## Counts 9      data.frame list
head(resList[['TPM']])
##      Gene   baseMean log2FoldChange     lfcSE       stat    pvalue      padj
## 1   A2MP1  3.0525639    -0.71117275 1.7535703 -0.4055570 0.6850681        NA
## 2 A3GALT2  0.0000000             NA        NA         NA        NA        NA
## 3   A4GNT  0.8296737    -1.91297083 3.0641144 -0.6243144 0.5324211        NA
## 4  AACSP1 13.9670247    -0.07686924 0.7514137 -0.1022995 0.9185190 0.9811568
## 5 AADACL2  0.0000000             NA        NA         NA        NA        NA
## 6 AADACL4  0.9621048    -0.34400388 2.9069797 -0.1183372 0.9058005        NA
##     FDR Input
## 1 > 0.1   TPM
## 2 > 0.1   TPM
## 3 > 0.1   TPM
## 4 > 0.1   TPM
## 5 > 0.1   TPM
## 6 > 0.1   TPM
head(resList[['Counts']])
##      Gene   baseMean log2FoldChange     lfcSE       stat    pvalue      padj
## 1   A2MP1  2.8317018     -0.8410796 1.8117650 -0.4642322 0.6424814        NA
## 2 A3GALT2  0.0000000             NA        NA         NA        NA        NA
## 3   A4GNT  0.8395428     -1.9116000 3.0121489 -0.6346300 0.5256698        NA
## 4  AACSP1 13.9323130     -0.1237197 0.7552029 -0.1638231 0.8698704 0.9661114
## 5 AADACL2  0.0000000             NA        NA         NA        NA        NA
## 6 AADACL4  0.9757540     -0.3681081 2.8561649 -0.1288819 0.8974511        NA
##     FDR  Input
## 1 > 0.1 Counts
## 2 > 0.1 Counts
## 3 > 0.1 Counts
## 4 > 0.1 Counts
## 5 > 0.1 Counts
## 6 > 0.1 Counts
# Shrinkage summary
summary(shr_resList)
##        Length Class      Mode
## TPM    9      data.frame list
## Counts 9      data.frame list
head(shr_resList[['TPM']])
##      Gene   baseMean log2FoldChange     lfcSE       stat    pvalue      padj
## 1   A2MP1  3.0525639    -0.13588716 0.3372049 -0.4055570 0.6850681        NA
## 2 A3GALT2  0.0000000             NA        NA         NA        NA        NA
## 3   A4GNT  0.8296737    -0.12996851 0.2230905 -0.6243144 0.5324211        NA
## 4  AACSP1 13.9670247    -0.04328458 0.4236828 -0.1022995 0.9185190 0.9811568
## 5 AADACL2  0.0000000             NA        NA         NA        NA        NA
## 6 AADACL4  0.9621048    -0.02659239 0.2313921 -0.1183372 0.9058005        NA
##     FDR Input
## 1 > 0.1   TPM
## 2 > 0.1   TPM
## 3 > 0.1   TPM
## 4 > 0.1   TPM
## 5 > 0.1   TPM
## 6 > 0.1   TPM
head(shr_resList[['Counts']])
##      Gene   baseMean log2FoldChange     lfcSE       stat    pvalue      padj
## 1   A2MP1  2.8317018    -0.14479543 0.3333802 -0.4642322 0.6424814        NA
## 2 A3GALT2  0.0000000             NA        NA         NA        NA        NA
## 3   A4GNT  0.8395428    -0.13464596 0.2270494 -0.6346300 0.5256698        NA
## 4  AACSP1 13.9323130    -0.06903748 0.4245107 -0.1638231 0.8698704 0.9661114
## 5 AADACL2  0.0000000             NA        NA         NA        NA        NA
## 6 AADACL4  0.9757540    -0.02945511 0.2355359 -0.1288819 0.8974511        NA
##     FDR  Input
## 1 > 0.1 Counts
## 2 > 0.1 Counts
## 3 > 0.1 Counts
## 4 > 0.1 Counts
## 5 > 0.1 Counts
## 6 > 0.1 Counts

Exploring mean-difference relationship with MA plots

- Reference: DESeq2 doc “MA-plot”

# Set ylim: has to adjusted by users depending on data 
yl <- c(-20, 20)
# Set min log2 fold change of interest 
mLog <- c(-1, 1)
# Define a function creating an MA plot
MA_fn <- function(List, Shr) {
    MAList <- ddsList 
    for (i in 1:2) {
        MAplot <- ggplot(List[[i]], 
                         aes(x=baseMean,
                             y=log2FoldChange,
                             color=FDR)) + geom_point() + scale_x_log10() + theme_bw() + scale_color_manual(values=c("blue", "grey")) + ggtitle(paste("MA plot:", names(List)[i], "Input with", Shr)) + ylim(yl[1], yl[2])+ geom_hline(yintercept=c(mLog[1], mLog[2]), linetype="dashed", color="red")
        MAList[[i]] <- MAplot
    }
    return(MAList)
}
# Create MA plots with or without shrinkage and store in a list
MA <- MA_fn(resList, "No Shrinkage")
shr_MA <- MA_fn(shr_resList, "Shrinkage")

Displaying MA plots

- x-axis: expression level (baseMean))

- y-axis: fold change (log2FoldChange)

- Red dashed lines: log2FoldChange = -1 and 1

- Upper: TPM with (right) or without (left) shrinkage

- Lower: Counts with (right) or without (left) shrinkage

# TPM with or without shrinkage
grid.arrange(MA[[1]], shr_MA[[1]], nrow=1)

# TPM with or without shrinkage
grid.arrange(MA[[2]], shr_MA[[2]], nrow=1)

Exploring distribution of false discovery rate (FDR)

- Distribution of adjusted p-val (FDR) was presented

- x-axis: FDR

- y-axis: Number of genes

- Black dashed line: FDR = 0.1

# Combining total data table 
res <- rbind(shr_resList[['TPM']], shr_resList[['Counts']])
res$Input <- factor(res$Input, levels=TvC)  # TvC=c("TPM", "Counts")
# Create a plot presenting distribution of FDR
ggplot(res,
       aes(x=padj, color=Input)) + 
geom_density(size=1, aes(y=..count..)) + 
theme_bw() +
ggtitle("Distribution of False Discovery Rate (FDR)") + 
xlab("Adjusted P-Value (FDR)") + 
ylab("Count") + 
geom_vline(xintercept=alpha, 
           color="black", 
           linetype="dashed",
           size=1) + 
scale_x_continuous(breaks=seq(0, 1, by=0.1)) 

Volcano plots

- Black dashed lines: log2FoldChange = -1 and 1

- x-axis: gene expression level (log2FoldChange)

- y-axis: number of genes

# Set xlim for volcano plots
xlim=c(-6, 6)    # has to be assined by users
# Set a basic volcano plot function 
Volcano_fn <- function(df, Label=NULL) {
ggplot(df, 
       aes(x=log2FoldChange,
           y= -log10(padj),
           color=FDR,
           label=Label)) + 
geom_point() +
facet_grid(.~Input) + 
theme_bw() +
scale_color_manual(values=c("blue", "grey")) + 
ggtitle("Volcano Plot") + 
ylab("-log10(padj)") + 
theme(strip.text.x=element_text(size=12)) + 
geom_vline(xintercept=c(mLog[1], mLog[2]), 
           color="red", 
           linetype="dashed", 
           size=1) + 
xlim(xlim[1], xlim[2])
}
# Display the volcano plots by input
Volcano_fn(res)

Volcano plots with promising genes

  • Log odds threshold (y-axis): > 20
# Assign log odds threshold 
LogOddsCut=20 
# Add a column indicating high log odds genes 
res <- res %>% 
    mutate(Label=ifelse(-log10(padj) > LogOddsCut, 
                                   Gene, 
                                   "")) 
# Display the genes with volcano plots
Volcano_fn(res, Label=res$Label) + geom_text_repel(color="black")

Exploring distribution of log2FoldChange by input type

- Black dashed lines: log2FoldChange = -1 and 1

- x-axis: gene expression level (log2FoldChange)

- y-axis: number of genes

ggplot(res[res$FDR == "< 0.1", ],  # Subset rows with FDR < alpha 
       aes(x=log2FoldChange,
           color=Input)) + 
geom_density(size=1, aes(y=..count..)) +
theme_bw() + 
ylab("Count") + 
geom_vline(xintercept=c(mLog[1], mLog[2]), 
           color="black",
           linetype="dashed", size=1) +
ggtitle("Distribution of Log2 Folds by Input Type") + 
xlim(xlim[1], xlim[2])

Exploring expression profiling with normalized count data

- Normalized count matrices are extracted from dds objects and filtered with thresholds set at FDR and log2FoldChange

- The heatmaps display z-scores of the normalized counts

- lowfdrList: a list of matrices filtered by FDR < alpha

- highfoldList: a list of matrices filtered by FDR < alpha AND absolute log2FoldChange > user’s minimum threshold (mLog)

- In this analysis, mLog = 1

- References: Harvard Chan Bioinformatics Core workshop, DESeq2 doc “Heatmap of the count matrix”

# Initialize a list 
lowfdrList <- ddsList   # A list for normalized counts matrix with FDR below alpha
highfoldList <- ddsList  # A list for normalized counts with log2foldchange over minLog
for (x in TvC) {
    # Create filtering vectors with alpha and log2foldchange
    BelowAlpha <- shr_resList[[x]]$FDR == FDRv[1]
    overmLog <- abs(shr_resList[[x]]$log2FoldChange) > mLog[2]  # mLog has been set to c(-1, 1) previously
    # Extract transformed counts from vsd objects (TPM[['vsd']] and Counts[['vsd']]) 
    if (x == "TPM") {
        normCounts <- counts(TPM[['dds']], normalized=T)
    
    } else {
        normCounts <- counts(Counts[['dds']], normalized=T)
    }
    
    # Update the normalized count matrix with FDR below alpha
    lowfdrList[[x]] <- normCounts[BelowAlpha, ]
    highfoldList[[x]] <- normCounts[BelowAlpha & overmLog, ]
    summary(lowfdrList[[x]])
    summary(highfoldList[[x]])
}
# Initialize map lists 
lowfdrMap <- ddsList
highfoldMap <- ddsList 
# Set a function creating a heatmap
ProfileHeatmap_fn <- function(inputmatrix, Title1, Title2, Title3=NULL) {
    
    as.ggplot(pheatmap(inputmatrix, 
             annotation=HeatmapAnno,
             scale="row",         # presents z-score instead of counts
             show_rownames=F,
             main=paste("Transcription Profiles with", 
                        Title1, 
                        "input and", 
                        Title2, 
                        alpha, 
                        Title3)))
}
# Create and save heatmaps
for (x in TvC) {
    lowfdrMap[[x]] <- ProfileHeatmap_fn(lowfdrList[[x]],
                                        Title1=x, 
                                        Title2="FDR <")
    highfoldMap[[x]] <- ProfileHeatmap_fn(highfoldList[[x]], 
                                          Title1=x, 
                                          Title2="FDR <",
                                          Title3=paste("+ Absolte Log2 Fold Change >", mLog[2])) 
}

NA statistics: zero count genes & outlier genes

When NAs appear in

- log2FoldChange: zero counts in all samples

- padj: too little information

- pval & padj: at least one replicate was an outlier

# Count number of NA genes  
type=c("Zero Counts", "Outliers", "Total NA Genes") 
NAstat <- res %>%
    group_by(Input) %>%
    summarize(zero=sum(is.na(log2FoldChange)), 
              outlier=sum(is.na(pvalue) & is.na(padj))) %>%
    mutate(total=sum(zero, outlier)) %>%
    gather(Type, Number, -Input) %>%
    mutate(Type=factor(case_when(Type == "zero" ~ type[1], 
                                 Type == "outlier" ~ type[2], 
                                 Type == "total" ~ type[3]), 
                       levels=type))
# Plot number of NA genes 
ggplot(NAstat, aes(x=Type, y=Number, group=Input, fill=Input, label=Number)) + 
    geom_bar(stat="identity", position="dodge") + 
    theme_bw() +
    geom_text(position=position_dodge(width=1), vjust=1.5) + 
    ggtitle("Number of NA Genes") + 
    ylab("Number of Genes")

Ranking DEGs with the TPM and original count inputs

- FDRrankList: ranked by FDR

- lfcList: ranked by absolute fold change

- UPlfcList: ranked by magnitude of fold change increase

- DOWNlfcList: ranked by manitude of fold change decrease

# Create a new list having DE table with FDR below alpha
lowfdr_resList <- shr_resList 
for (x in TvC) { 
    lowfdr_resList[[x]] <- filter(shr_resList[[x]], 
                                  FDR == FDRv[1]) %>% 
    as.data.table()
}
# Initialize new lists in order to store rank-updated result DE tables 
FDRrankList <- lowfdr_resList
lfcList <- lowfdr_resList
UPlfcList <- lowfdr_resList
DOWNlfcList <- lowfdr_resList
# Set a function creating a column for the rank
Ranking_fn <- function(x) {mutate(x, Rank=1:nrow(x))}
for (x in TvC) { 
    # Rearrange genes with FDR  
    FDRrankList[[x]] <- lowfdr_resList[[x]][order(padj),] %>%
        Ranking_fn()
    # Rearrange genew with absolute log2FoldChange 
    lfcList[[x]] <- lowfdr_resList[[x]][order(-abs(log2FoldChange)),] %>%
        Ranking_fn()
    # Rearrange genes with log2FoldChange (decreasing order)
    UPlfcList[[x]] <- lowfdr_resList[[x]][order(-log2FoldChange),] %>%
        Ranking_fn()
    # Rearrange genes with log2FoldChange (increasing order)
    DOWNlfcList[[x]] <- lowfdr_resList[[x]][order(log2FoldChange),] %>%
        Ranking_fn()
}
# Explore the ranks
print(c(FDRrankList, lfcList, UPlfcList, DOWNlfcList))
## $TPM
##         Gene    baseMean log2FoldChange     lfcSE      stat       pvalue
##   1:    CCN2  3782.15461      3.3138239 0.1852169 17.879269 1.710657e-71
##   2:     F2R  2460.48267      1.9146387 0.1209837 15.821943 2.196087e-56
##   3:   CDCP1  1500.91438      2.6204056 0.1822811 14.363152 8.812579e-47
##   4:  STK38L 11846.66658      1.8911153 0.1338459 14.128446 2.536811e-45
##   5: TM4SF18  1895.68247      1.9727888 0.1415730 13.930193 4.152312e-44
##  ---                                                                    
## 519:  MRPL51  1960.13310      0.3127969 0.1272269  2.458574 1.394902e-02
## 520: L3MBTL3   291.07491     -0.4897894 0.1994004 -2.456223 1.404062e-02
## 521:  SDHAF3  2795.76859     -0.3085702 0.1257367 -2.454098 1.412385e-02
## 522:  HEXIM1   337.54914     -0.4614688 0.1882160 -2.451750 1.421635e-02
## 523: TMSB15B    77.90622      0.6931845 0.2827217  2.449988 1.428610e-02
##              padj   FDR Input Rank
##   1: 6.249030e-68 < 0.1   TPM    1
##   2: 4.011153e-53 < 0.1   TPM    2
##   3: 1.073078e-43 < 0.1   TPM    3
##   4: 2.316742e-42 < 0.1   TPM    4
##   5: 3.033679e-41 < 0.1   TPM    5
##  ---                              
## 519: 9.818068e-02 < 0.1   TPM  519
## 520: 9.863534e-02 < 0.1   TPM  520
## 521: 9.902957e-02 < 0.1   TPM  521
## 522: 9.948720e-02 < 0.1   TPM  522
## 523: 9.978415e-02 < 0.1   TPM  523
## 
## $Counts
##         Gene    baseMean log2FoldChange     lfcSE      stat       pvalue
##   1:    CCN2  3838.81312      3.3117030 0.1858269 17.809254 5.990320e-71
##   2:     F2R  2498.11791      1.9128696 0.1211696 15.783102 4.067010e-56
##   3:   CDCP1  1523.30246      2.6177630 0.1826892 14.316349 1.729741e-46
##   4:  STK38L 12016.77780      1.8894025 0.1345047 14.046514 8.092088e-45
##   5: TM4SF18  1919.12630      1.9716534 0.1416724 13.912413 5.325183e-44
##  ---                                                                    
## 527:  CUTALP    80.56840     -0.6867126 0.2798177 -2.451451 1.422816e-02
## 528:  SDHAF3  2839.34437     -0.3094651 0.1263165 -2.449916 1.428894e-02
## 529:   CREG1   273.30711     -0.4547095 0.1859437 -2.445053 1.448308e-02
## 530:  P2RY13    74.69163     -0.6867880 0.2807198 -2.444416 1.450869e-02
## 531:   RBM43    33.32318     -0.8843602 0.3610070 -2.444555 1.450309e-02
##              padj   FDR  Input Rank
##   1: 2.188863e-67 < 0.1 Counts    1
##   2: 7.430427e-53 < 0.1 Counts    2
##   3: 2.106825e-43 < 0.1 Counts    3
##   4: 7.392122e-42 < 0.1 Counts    4
##   5: 3.891644e-41 < 0.1 Counts    5
##  ---                               
## 527: 9.867185e-02 < 0.1 Counts  527
## 528: 9.888595e-02 < 0.1 Counts  528
## 529: 9.983948e-02 < 0.1 Counts  529
## 530: 9.983948e-02 < 0.1 Counts  530
## 531: 9.983948e-02 < 0.1 Counts  531
## 
## $TPM
##           Gene    baseMean log2FoldChange     lfcSE      stat       pvalue
##   1:      EDN1   141.58628      3.4835855 0.3262539 10.347843 4.280208e-25
##   2:      CCN2  3782.15461      3.3138239 0.1852169 17.879269 1.710657e-71
##   3:     GPR87   526.04952      3.2831219 0.2391306 13.655049 1.883633e-42
##   4:     MUC12    31.29951      3.0505542 0.4135533  5.990845 2.087531e-09
##   5:  SERPINE1  8891.97515      2.8509705 0.3263039  8.759388 1.963139e-18
##  ---                                                                      
## 519:     PPIL4  3537.05463      0.3036901 0.1171944  2.591328 9.560620e-03
## 520:       ND1 35591.46112     -0.2990627 0.1045701 -2.859924 4.237422e-03
## 521:  EIF4EBP2  4471.66924      0.2985887 0.1171230  2.549363 1.079198e-02
## 522:     SF3B6  4172.24628      0.2930706 0.1117374  2.622851 8.719732e-03
## 523: RPL36AP37    15.01076      0.1506778 0.1855742  5.300249 1.156447e-07
##              padj   FDR Input Rank
##   1: 1.563560e-22 < 0.1   TPM    1
##   2: 6.249030e-68 < 0.1   TPM    2
##   3: 1.146819e-39 < 0.1   TPM    3
##   4: 1.229960e-07 < 0.1   TPM    4
##   5: 3.585673e-16 < 0.1   TPM    5
##  ---                              
## 519: 7.446683e-02 < 0.1   TPM  519
## 520: 3.989511e-02 < 0.1   TPM  520
## 521: 8.128477e-02 < 0.1   TPM  521
## 522: 6.985347e-02 < 0.1   TPM  522
## 523: 4.912212e-06 < 0.1   TPM  523
## 
## $Counts
##           Gene    baseMean log2FoldChange     lfcSE      stat       pvalue
##   1:      EDN1   143.61165      3.4934078 0.3252202 10.406217 2.322691e-25
##   2:      CCN2  3838.81312      3.3117030 0.1858269 17.809254 5.990320e-71
##   3:     GPR87   533.54706      3.2823931 0.2390153 13.656904 1.836253e-42
##   4:     ALPK2    44.93538      3.1746617 0.4192791  5.808344 6.309395e-09
##   5:     MUC12    19.79726      2.9517456 0.4237826  6.511310 7.449834e-11
##  ---                                                                      
## 527:     PPIL4  3591.69222      0.3024157 0.1176344  2.570801 1.014635e-02
## 528:       ND1 36140.65876     -0.3001565 0.1054619 -2.846112 4.425659e-03
## 529:  EIF4EBP2  4540.89362      0.2974076 0.1179816  2.520798 1.170890e-02
## 530:     SF3B6  4236.85234      0.2917483 0.1123824  2.596033 9.430688e-03
## 531: RPL36AP37    15.15450      0.1511406 0.1861197  5.298424 1.168063e-07
##              padj   FDR  Input Rank
##   1: 8.487113e-23 < 0.1 Counts    1
##   2: 2.188863e-67 < 0.1 Counts    2
##   3: 1.118278e-39 < 0.1 Counts    3
##   4: 3.440975e-07 < 0.1 Counts    4
##   5: 5.917760e-09 < 0.1 Counts    5
##  ---                               
## 527: 7.723704e-02 < 0.1 Counts  527
## 528: 4.094014e-02 < 0.1 Counts  528
## 529: 8.660795e-02 < 0.1 Counts  529
## 530: 7.316291e-02 < 0.1 Counts  530
## 531: 4.962910e-06 < 0.1 Counts  531
## 
## $TPM
##            Gene   baseMean log2FoldChange     lfcSE      stat       pvalue
##   1:       EDN1  141.58628       3.483585 0.3262539 10.347843 4.280208e-25
##   2:       CCN2 3782.15461       3.313824 0.1852169 17.879269 1.710657e-71
##   3:      GPR87  526.04952       3.283122 0.2391306 13.655049 1.883633e-42
##   4:      MUC12   31.29951       3.050554 0.4135533  5.990845 2.087531e-09
##   5:   SERPINE1 8891.97515       2.850970 0.3263039  8.759388 1.963139e-18
##  ---                                                                      
## 519:     TRMT9B  145.07064      -1.716032 0.3049279 -5.610108 2.021999e-08
## 520:    TBC1D3L   28.55182      -1.742442 0.3813485 -4.499991 6.795649e-06
## 521:     CTXND2  168.84610      -1.817134 0.2652655 -6.827901 8.616600e-12
## 522:    WFIKKN2  371.00721      -1.937066 0.2298615 -8.412662 4.008097e-17
## 523: MROH7-TTC4   19.77094      -2.336210 0.4243946 -5.001629 5.684777e-07
##              padj   FDR Input Rank
##   1: 1.563560e-22 < 0.1   TPM    1
##   2: 6.249030e-68 < 0.1   TPM    2
##   3: 1.146819e-39 < 0.1   TPM    3
##   4: 1.229960e-07 < 0.1   TPM    4
##   5: 3.585673e-16 < 0.1   TPM    5
##  ---                              
## 519: 9.981573e-07 < 0.1   TPM  519
## 520: 1.812008e-04 < 0.1   TPM  520
## 521: 7.869110e-10 < 0.1   TPM  521
## 522: 6.365904e-15 < 0.1   TPM  522
## 523: 2.185946e-05 < 0.1   TPM  523
## 
## $Counts
##            Gene   baseMean log2FoldChange     lfcSE      stat       pvalue
##   1:       EDN1  143.61165       3.493408 0.3252202 10.406217 2.322691e-25
##   2:       CCN2 3838.81312       3.311703 0.1858269 17.809254 5.990320e-71
##   3:      GPR87  533.54706       3.282393 0.2390153 13.656904 1.836253e-42
##   4:      ALPK2   44.93538       3.174662 0.4192791  5.808344 6.309395e-09
##   5:      MUC12   19.79726       2.951746 0.4237826  6.511310 7.449834e-11
##  ---                                                                      
## 527:     TRMT9B  145.42958      -1.727955 0.3060439 -5.633807 1.762745e-08
## 528:    TBC1D3L   29.13473      -1.733263 0.3769001 -4.528622 5.936973e-06
## 529:     CTXND2  171.45399      -1.825347 0.2641534 -6.887617 5.673462e-12
## 530:    WFIKKN2  376.78945      -1.939575 0.2292372 -8.446372 3.004952e-17
## 531: MROH7-TTC4   20.24325      -2.216531 0.4251913 -4.950916 7.386496e-07
##              padj   FDR  Input Rank
##   1: 8.487113e-23 < 0.1 Counts    1
##   2: 2.188863e-67 < 0.1 Counts    2
##   3: 1.118278e-39 < 0.1 Counts    3
##   4: 3.440975e-07 < 0.1 Counts    4
##   5: 5.917760e-09 < 0.1 Counts    5
##  ---                               
## 527: 8.588092e-07 < 0.1 Counts  527
## 528: 1.583482e-04 < 0.1 Counts  528
## 529: 5.315597e-10 < 0.1 Counts  529
## 530: 4.773954e-15 < 0.1 Counts  530
## 531: 2.726289e-05 < 0.1 Counts  531
## 
## $TPM
##            Gene   baseMean log2FoldChange     lfcSE      stat       pvalue
##   1: MROH7-TTC4   19.77094      -2.336210 0.4243946 -5.001629 5.684777e-07
##   2:    WFIKKN2  371.00721      -1.937066 0.2298615 -8.412662 4.008097e-17
##   3:     CTXND2  168.84610      -1.817134 0.2652655 -6.827901 8.616600e-12
##   4:    TBC1D3L   28.55182      -1.742442 0.3813485 -4.499991 6.795649e-06
##   5:     TRMT9B  145.07064      -1.716032 0.3049279 -5.610108 2.021999e-08
##  ---                                                                      
## 519:   SERPINE1 8891.97515       2.850970 0.3263039  8.759388 1.963139e-18
## 520:      MUC12   31.29951       3.050554 0.4135533  5.990845 2.087531e-09
## 521:      GPR87  526.04952       3.283122 0.2391306 13.655049 1.883633e-42
## 522:       CCN2 3782.15461       3.313824 0.1852169 17.879269 1.710657e-71
## 523:       EDN1  141.58628       3.483585 0.3262539 10.347843 4.280208e-25
##              padj   FDR Input Rank
##   1: 2.185946e-05 < 0.1   TPM    1
##   2: 6.365904e-15 < 0.1   TPM    2
##   3: 7.869110e-10 < 0.1   TPM    3
##   4: 1.812008e-04 < 0.1   TPM    4
##   5: 9.981573e-07 < 0.1   TPM    5
##  ---                              
## 519: 3.585673e-16 < 0.1   TPM  519
## 520: 1.229960e-07 < 0.1   TPM  520
## 521: 1.146819e-39 < 0.1   TPM  521
## 522: 6.249030e-68 < 0.1   TPM  522
## 523: 1.563560e-22 < 0.1   TPM  523
## 
## $Counts
##            Gene   baseMean log2FoldChange     lfcSE      stat       pvalue
##   1: MROH7-TTC4   20.24325      -2.216531 0.4251913 -4.950916 7.386496e-07
##   2:    WFIKKN2  376.78945      -1.939575 0.2292372 -8.446372 3.004952e-17
##   3:     CTXND2  171.45399      -1.825347 0.2641534 -6.887617 5.673462e-12
##   4:    TBC1D3L   29.13473      -1.733263 0.3769001 -4.528622 5.936973e-06
##   5:     TRMT9B  145.42958      -1.727955 0.3060439 -5.633807 1.762745e-08
##  ---                                                                      
## 527:      MUC12   19.79726       2.951746 0.4237826  6.511310 7.449834e-11
## 528:      ALPK2   44.93538       3.174662 0.4192791  5.808344 6.309395e-09
## 529:      GPR87  533.54706       3.282393 0.2390153 13.656904 1.836253e-42
## 530:       CCN2 3838.81312       3.311703 0.1858269 17.809254 5.990320e-71
## 531:       EDN1  143.61165       3.493408 0.3252202 10.406217 2.322691e-25
##              padj   FDR  Input Rank
##   1: 2.726289e-05 < 0.1 Counts    1
##   2: 4.773954e-15 < 0.1 Counts    2
##   3: 5.315597e-10 < 0.1 Counts    3
##   4: 1.583482e-04 < 0.1 Counts    4
##   5: 8.588092e-07 < 0.1 Counts    5
##  ---                               
## 527: 5.917760e-09 < 0.1 Counts  527
## 528: 3.440975e-07 < 0.1 Counts  528
## 529: 1.118278e-39 < 0.1 Counts  529
## 530: 2.188863e-67 < 0.1 Counts  530
## 531: 8.487113e-23 < 0.1 Counts  531

Comparing DEG ranks between TPM- and Counts-inputted DE analysis

# Set a function rebuilding DE tables with gene ranks 
combineTable_fn <- function(List){
    # Select columns and join the data frames by gene
    full_join(List[[TvC[1]]][,.(Gene, Input, Rank, baseMean)], 
              List[[TvC[2]]][,.(Gene, Input, Rank, baseMean)], by="Gene") %>%
    
    # Add columns assining gene expression levels, rank differences, and mean ranks
    mutate(logMeanExpression=log(baseMean.x+baseMean.y/2),
           RankDiff=Rank.x-Rank.y, 
           MeanRank=(Rank.x+Rank.y)/2)
} 
# Explore outputs of the function
head(combineTable_fn(FDRrankList))
##       Gene Input.x Rank.x baseMean.x Input.y Rank.y baseMean.y
## 1:    CCN2     TPM      1  3782.1546  Counts      1  3838.8131
## 2:     F2R     TPM      2  2460.4827  Counts      2  2498.1179
## 3:   CDCP1     TPM      3  1500.9144  Counts      3  1523.3025
## 4:  STK38L     TPM      4 11846.6666  Counts      4 12016.7778
## 5: TM4SF18     TPM      5  1895.6825  Counts      5  1919.1263
## 6:   GPR87     TPM      6   526.0495  Counts      6   533.5471
##    logMeanExpression RankDiff MeanRank
## 1:          8.648495        0        1
## 2:          8.218664        0        2
## 3:          7.724255        0        3
## 4:          9.790042        0        4
## 5:          7.956913        0        5
## 6:          6.675600        0        6
dim(combineTable_fn(FDRrankList))
## [1] 535  10
tail(combineTable_fn(FDRrankList))
##      Gene Input.x Rank.x baseMean.x Input.y Rank.y baseMean.y logMeanExpression
## 1:   UTP3    <NA>     NA         NA  Counts    524  308.86592                NA
## 2:   IGIP    <NA>     NA         NA  Counts    525   90.33876                NA
## 3: CUTALP    <NA>     NA         NA  Counts    527   80.56840                NA
## 4:  CREG1    <NA>     NA         NA  Counts    529  273.30711                NA
## 5: P2RY13    <NA>     NA         NA  Counts    530   74.69163                NA
## 6:  RBM43    <NA>     NA         NA  Counts    531   33.32318                NA
##    RankDiff MeanRank
## 1:       NA       NA
## 2:       NA       NA
## 3:       NA       NA
## 4:       NA       NA
## 5:       NA       NA
## 6:       NA       NA
# Set a function plotting gene ranks between TPM- (x-axis) and Counts-Inputs (y-axis)
compareRanks_fn <- function(df, rankedby) {
ggplot(df, 
       aes(x=Rank.x, 
           y=Rank.y,
           color=logMeanExpression)) +
geom_point(alpha=0.5) + 
theme_bw() + 
xlab("Rank with TPM") + 
ylab("Rank with Counts") + 
geom_abline(slope=1, color="black", size=0.5) + 
ggtitle(paste(rankedby, "Ranking with TPM vs Counts Inputs")) + 
scale_color_gradient(low="blue", high="red")
}
   
# Set a function plotting the rank difference over the gene expression level
RankdiffOverMean_fn <- function(df, rankedby) {
ggplot(df, 
       aes(x=logMeanExpression, 
           y=RankDiff,
           color=MeanRank)) +
geom_point(alpha=0.5) + 
theme_bw() + 
ylab("Rank Difference (TPM - Counts)") +
ggtitle(paste("Rank Difference Inputs (TPM - Counts) in", rankedby)) + 
geom_hline(yintercept=0, color="black", size=0.5) + 
scale_color_gradient(low="blue", high="red")
}

Visualizing DEG ranks I: TPM- vs Counts-input

- x-axis: rank with TPM input

- y-axis: rank with Counts input

- Black diagonal lines: rank with TPM = rank with Counts

- Dot color: gene expression level (log-baseMean)

# Ranked by FDR
compareRanks_fn(combineTable_fn(FDRrankList), 
                "FDR")

# Ranked by absolute fold change 
compareRanks_fn(combineTable_fn(lfcList), 
                "Absolute Log2FoldChange")

# Ranked by magnitude of positive fold change
compareRanks_fn(combineTable_fn(UPlfcList), 
                "Log2FoldChange (Increased)")

# Ranked by magnitude of negative fold change
compareRanks_fn(combineTable_fn(DOWNlfcList), 
                "Log2FoldChange (Decreased)")

Visualizing DEG ranks II: Relationship between gene expression level and rank difference

- x-axis: expression level (log-baseMean)

- y-axis: rank difference (rank with TPM - rank with Counts)

- Black horizontal lines: rank with TPM = rank with Counts

- Dot color: average rank

# Ranked by FDR
RankdiffOverMean_fn(combineTable_fn(FDRrankList), 
                "FDR")

# Ranked by absolute fold change 
RankdiffOverMean_fn(combineTable_fn(lfcList), 
                "Absolute Log2FoldChange")

# Ranked by magnitude of positive fold change
RankdiffOverMean_fn(combineTable_fn(UPlfcList), 
                "Log2FoldChange (Increased)")

# Ranked by magnitude of negative fold change
RankdiffOverMean_fn(combineTable_fn(DOWNlfcList), 
                "Log2FoldChange (Decreased)")

Summarizing up/down DEGs with an upset plot

- Calculate the number of genes

# Clean data to generate an upset plot
res <- res %>%
    # Filter genes with valid padj 
    filter(!is.na(padj)) %>% 
    # Detect genes which are up/down/unchanged change patterns in either TPM and Counts inputs
    mutate(Up=ifelse(FDR == FDRv[1] & log2FoldChange > 0, Gene, ""), # What are upregulated genes? 
           Down=ifelse(FDR == FDRv[1] & log2FoldChange < 0, Gene, ""),  # What are downregulated genes? 
           Unchanged=ifelse(FDR == FDRv[2], Gene, ""),   # What are unchanged genes? 
           TPM_Input=ifelse(Input == "TPM", Gene, ""),   # What are the genes from TPM input? 
           Counts_Input=ifelse(Input == "Counts", Gene, ""))   # What are the genes from Counts input?
# Create a list storing groups of interest
upsetInput <- list(Up=res$Up, 
                   Down=res$Down, 
                   Unchanged=res$Unchanged, 
                   TPM_Input=res$TPM, 
                   Counts_Input=res$Counts)
# Create the upset plot 
upset(fromList(upsetInput), 
      sets.x.label="Number of Genes per Group", 
      order.by="freq") 

Session Info

sessionInfo()
## R version 4.0.3 (2020-10-10)
## Platform: x86_64-conda-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.1 LTS
## 
## Matrix products: default
## BLAS/LAPACK: /home/mira/miniconda3/envs/r/lib/libopenblasp-r0.3.10.so
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats4    parallel  stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] AnnotationDbi_1.52.0        UpSetR_1.4.0               
##  [3] ggrepel_0.8.2               ggplotify_0.0.5            
##  [5] gridExtra_2.3               pheatmap_1.0.12            
##  [7] DESeq2_1.30.0               SummarizedExperiment_1.20.0
##  [9] Biobase_2.50.0              MatrixGenerics_1.2.0       
## [11] matrixStats_0.57.0          GenomicRanges_1.42.0       
## [13] GenomeInfoDb_1.26.2         IRanges_2.24.0             
## [15] S4Vectors_0.28.1            tximport_1.18.0            
## [17] forcats_0.5.0               stringr_1.4.0              
## [19] dplyr_1.0.2                 purrr_0.3.4                
## [21] readr_1.4.0                 tidyr_1.1.2                
## [23] tibble_3.0.4                ggplot2_3.3.2              
## [25] tidyverse_1.3.0             AnnotationHub_2.22.0       
## [27] BiocFileCache_1.14.0        dbplyr_2.0.0               
## [29] BiocGenerics_0.36.0         rmarkdown_2.5              
## [31] data.table_1.13.4          
## 
## loaded via a namespace (and not attached):
##  [1] colorspace_2.0-0              ellipsis_0.3.1               
##  [3] XVector_0.30.0                fs_1.5.0                     
##  [5] rstudioapi_0.13               farver_2.0.3                 
##  [7] bit64_4.0.5                   interactiveDisplayBase_1.28.0
##  [9] fansi_0.4.1                   lubridate_1.7.9.2            
## [11] xml2_1.3.2                    splines_4.0.3                
## [13] geneplotter_1.68.0            knitr_1.30                   
## [15] jsonlite_1.7.2                broom_0.7.2                  
## [17] annotate_1.68.0               shiny_1.5.0                  
## [19] BiocManager_1.30.10           compiler_4.0.3               
## [21] httr_1.4.2                    rvcheck_0.1.8                
## [23] backports_1.2.1               assertthat_0.2.1             
## [25] Matrix_1.2-18                 fastmap_1.0.1                
## [27] cli_2.2.0                     later_1.1.0.1                
## [29] htmltools_0.5.0               tools_4.0.3                  
## [31] gtable_0.3.0                  glue_1.4.2                   
## [33] GenomeInfoDbData_1.2.4        rappdirs_0.3.1               
## [35] Rcpp_1.0.5                    cellranger_1.1.0             
## [37] vctrs_0.3.5                   xfun_0.19                    
## [39] ps_1.5.0                      rvest_0.3.6                  
## [41] mime_0.9                      lifecycle_0.2.0              
## [43] XML_3.99-0.5                  zlibbioc_1.36.0              
## [45] scales_1.1.1                  hms_0.5.3                    
## [47] promises_1.1.1                RColorBrewer_1.1-2           
## [49] yaml_2.2.1                    curl_4.3                     
## [51] memoise_1.1.0                 stringi_1.5.3                
## [53] RSQLite_2.2.1                 BiocVersion_3.12.0           
## [55] genefilter_1.72.0             BiocParallel_1.24.1          
## [57] rlang_0.4.9                   pkgconfig_2.0.3              
## [59] bitops_1.0-6                  evaluate_0.14                
## [61] lattice_0.20-41               labeling_0.4.2               
## [63] bit_4.0.4                     tidyselect_1.1.0             
## [65] plyr_1.8.6                    magrittr_2.0.1               
## [67] R6_2.5.0                      generics_0.1.0               
## [69] DelayedArray_0.16.0           DBI_1.1.0                    
## [71] pillar_1.4.7                  haven_2.3.1                  
## [73] withr_2.3.0                   survival_3.2-7               
## [75] RCurl_1.98-1.2                modelr_0.1.8                 
## [77] crayon_1.3.4                  locfit_1.5-9.4               
## [79] grid_4.0.3                    readxl_1.3.1                 
## [81] blob_1.2.1                    reprex_0.3.0                 
## [83] digest_0.6.27                 xtable_1.8-4                 
## [85] httpuv_1.5.4                  gridGraphics_0.5-0           
## [87] munsell_0.5.0